How complete is the CDC's COVID-19 case surveillance data for race/ethnicity at the state and county levels for people who died?

Feb 3, 2021

In [ ]:
#@title
import pandas as pd
import altair as alt
from vega_datasets import data

from google.colab import auth
auth.authenticate_user()

# Turn off the three-dot menu for Altair/Vega charts.
alt.renderers.set_embed_options(actions=False)
#%load_ext google.colab.data_table
In [2]:
#@title
def FieldAnalysis(project_id, table, field_list):
  dict = {}
  for field in field_list:
      dict[field] = [0.0, 0.0, 0.0, 0.0]
  unknowns = pd.DataFrame(dict, index=['Unknown', 'Missing', 'NA', 'Known'])
  field_series = []
  value_series = []
  percent_series = []

  for field in field_list:
    field_unknowns_query = ('''
    SELECT
      %s,
      count(*) as cases
    FROM
      %s
    WHERE
      death_yn = 'Yes'
    GROUP BY
      %s
    ''')
    query = field_unknowns_query % (field, table, field)
    field_unknowns_df = pd.io.gbq.read_gbq(query, project_id=project_id)
    field_unknowns_df.set_index(field, inplace=True)
    field_unknowns_df.index = field_unknowns_df.index.fillna('Null')

    field_display_name = {
        'cdc_case_earliest_dt': 'CDC earliest case date',
        'current_status': 'Case status',
        'res_state': 'State',
        'res_county': 'County',
        'sex': 'Sex',
        'age_group': 'Age',
        'race_ethnicity_combined': 'Race/Ethnicity'}

    missing_count = 0
    if 'Missing' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Missing'].cases
    if 'Null' in field_unknowns_df.index:
      missing_count += field_unknowns_df.loc['Null'].cases
    #if field_unknowns_df.index.isnull().any():
    #  missing_count += field_unknowns_df.loc[field_unknowns_df.index.isnull()].cases
    unknowns.loc['Missing', field] = missing_count / field_unknowns_df.cases.sum()

    if 'Unknown' in field_unknowns_df.index:
      unknowns.loc['Unknown', field] = field_unknowns_df.loc['Unknown'].cases / field_unknowns_df.cases.sum()
    if 'NA' in field_unknowns_df.index:
      unknowns.loc['NA', field] = field_unknowns_df.loc['NA'].cases / field_unknowns_df.cases.sum()
    unknowns.loc['Known', field] = 1 - (unknowns.loc['Missing', field] +
                                        unknowns.loc['Unknown', field] +
                                        unknowns.loc['NA', field])
    field_series.extend([field_display_name.get(field, field)] * 4)
    value_series.extend(['Known', 'Supressed', 'Unknown', 'Missing'])
    percent_series.extend([unknowns.loc['Known', field],
                           unknowns.loc['NA', field],
                           unknowns.loc['Unknown', field],
                           unknowns.loc['Missing', field]])
  test = pd.DataFrame.from_dict({'field': field_series,
                               'value': value_series,
                               'percent': percent_series})
  return alt.Chart(test).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='%'), title=''),
      y=alt.Y('field', sort='x', title='Field'),
      color=alt.Color('value', scale=alt.Scale(scheme='category20'), title='Value'),
      order=alt.Order('field:N'),
      tooltip=[
                  alt.Tooltip('field:N', title='Field'),
                  alt.Tooltip('value:N', title='Value'),
                  alt.Tooltip('percent:Q', format=',.0%', title='Percent'),
      ]
  )
In [3]:
#@title
CASES = 'Cases'
DATASET = 'cdc'
metric = CASES

project_id = 'msm-secure-data-1b'
cdc_table = '`%s.ndunlap_secure.cdc_restricted_access_20201231`' % project_id
date = 'DATE(2020, 12, 16)'
date_int = '20201216'
date_display_name = 'Dec 16'

# Chart settings.
total_cases_scale_max = 40000
scatter_height = 300
scatter_width = 300
map_height = 300
map_width = 450
us_states = alt.topo_feature(data.us_10m.url, 'states')
us_counties = alt.topo_feature(data.us_10m.url+"#", 'counties')

territories = ('PR', 'GU', 'VI', 'MP', 'AS')
race_ethnicity_combined_map = {
    'Asian, Non-Hispanic': 'asian_cases',
    'Black, Non-Hispanic': 'black_cases',
    'White, Non-Hispanic': 'white_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'aian_cases',
    'Hispanic/Latino': 'hispanic_cases',
    'Multiple/Other, Non-Hispanic': 'other_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'nhpi_cases',
    'Missing': 'unknown_cases',
    'Unknown': 'unknown_cases',
    'NA': 'na_cases',
}
In [4]:
#@title
crdt_query = ('''
SELECT
  State as state,
  Deaths_Total as crdt_cases,
  Deaths_Total - Deaths_Unknown as crdt_known_race_cases,
  ROUND(1 - Deaths_Unknown / Deaths_Total, 4) as crdt_known_race_cases_percent,
FROM `msm-secure-data-1b.ndunlap_secure.crdt`
WHERE
  date = %s
''' % date_int)

nyt_states_query = ('''
SELECT
  state_name,
  state_fips_code,
  deaths as nyt_cases,
  deaths as nyt_deaths
FROM `bigquery-public-data.covid19_nyt.us_states`
WHERE
  date = %s AND
  state_fips_code IS NOT NULL
''' % date)

nyt_counties_query = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = %s AND
  county_fips_code IS NOT NULL
''' % date)

cdc_states_query = ('''
SELECT
  res_state,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state
''' % cdc_table)

cdc_counties_query = ('''
SELECT
  res_state,
  res_county,
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_county,
   res_state,
   race_ethnicity_combined
''' % cdc_table)

compare_cases_unknowns_query = ('''
SELECT
  res_state,
  race_ethnicity_combined,
  COUNT(*) as cdc_cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   res_state,
   race_ethnicity_combined
''' % cdc_table)

cdc_states_by_month_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as total_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2020, 12, 1) AND
  res_state in ('AK', 'CA', 'CT', 'DE', 'GA', 'LA', 'MD', 'ND', 'NY', 'PA', 'RI')
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_states_by_month_known_or_na_query = ('''
SELECT
  res_state,
  CONCAT(EXTRACT(YEAR from cdc_case_earliest_dt), '-Q', EXTRACT(QUARTER from cdc_case_earliest_dt)) as date,
  COUNT(*) as known_or_na_cases,
FROM
  %s
WHERE
  death_yn = 'Yes' AND
  cdc_case_earliest_dt >= DATE(2020, 1, 1) AND
  cdc_case_earliest_dt < DATE(2020, 12, 1) AND
  race_ethnicity_combined != 'Unknown' AND
  race_ethnicity_combined != 'Missing'
GROUP BY
   1, 2
ORDER BY
   1, 2
''' % cdc_table)

cdc_overall_query = ('''
SELECT
  race_ethnicity_combined,
  COUNT(*) as cases
FROM
  %s
WHERE
  death_yn = 'Yes'
GROUP BY
   1
''' % cdc_table)

Background

The CDC Case Surveillance data about who died from COVID-19 has issues similar to those discussed in the case data completeness analysis.

  • Only 80% of total deaths up to Dec 16 are included (60K deaths are missing)
  • Of those deaths, 78% have known race/ethnicity (54K out of 239K cases are missing race/ethnicity)

For the 185M cases where we do know race/ethnicity, we can see the following disparities across race/ethnicity groups:

In [5]:
#@title
overall_df = pd.io.gbq.read_gbq(cdc_overall_query, project_id=project_id)
overall_df['race_ethnicity_combined'] = overall_df.race_ethnicity_combined.astype('string').str.strip()
overall_df = overall_df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
overall_df = overall_df.set_index('race_ethnicity_combined')

chart_denominator = 1000
cases_list = [overall_df.cases['hispanic_cases'] / chart_denominator,
         overall_df.cases['black_cases'] / chart_denominator,
         overall_df.cases['white_cases'] / chart_denominator,
         overall_df.cases['asian_cases'] / chart_denominator,
         overall_df.cases['nhpi_cases'] / chart_denominator,
         overall_df.cases['aian_cases'] / chart_denominator,
         overall_df.cases.sum() / chart_denominator,
]

# Population data from https://api.census.gov/data/2019/acs/acs1/profile?get=NAME,DP05_0071E,DP05_0078E,DP05_0077E,DP05_0080E,DP05_0081E,DP05_0079E,DP05_0070E&for=us:1
pop_list = [
    60481746 / chart_denominator,
    40596040  / chart_denominator,
    196789401 / chart_denominator,
    18427914  / chart_denominator,
    565473 / chart_denominator,
    2236348 / chart_denominator,
    328239523 / chart_denominator,
]
percent_list = []
for i in range(len(cases_list)):
  percent_list.append(cases_list[i] / pop_list[i])
prevalence = pd.DataFrame.from_dict({'group': [
    'Hispanic/Latino',
    'Black',
    'White',
    'Asian',
    'Native Hawaiian/Pacific Islander',
    'American Indian/Alaska Native',
    '*Total Including Unknowns*',
], 'percent': percent_list,
   'cases': cases_list,
   'population': pop_list,
})

bars = alt.Chart(prevalence).mark_bar().encode(
      x=alt.X('percent', axis=alt.Axis(format='.2%'), title=''),
      y=alt.Y('group', sort='-x', title=''),
      color=alt.Color('group', 
                      scale=alt.Scale(scheme='tableau20'),
                      title='',
                      legend=None),
      tooltip=[
                  alt.Tooltip('group:N', title='Race/Ethnicity Group'),
                  alt.Tooltip('percent:Q', format='.2%', title='Percent who died'),
                  alt.Tooltip('cases:Q', format=',.2f', title='Deaths in group (thousands)'),
                  alt.Tooltip('population:Q', format=',.0f', title='Population of group (thousands)'),
      ]
).properties(
   title='Percent of Race/Ethnicity Group who died from COVID-19 based on Incomplete CDC Data as of %s' % 'Dec 16'
)

bars.display()

#alt.concat(bars).properties(
#    title=alt.TitleParams(
#        ['Source: U.S. Census Bureau\'s American Community Survey 2019 5-year estimates for population data.'],
#        baseline='bottom',
#        dy=20,
#        orient='bottom',
#        fontWeight='normal',
#        fontSize=11
#    )
#).display()

But the chart above is based on incomplete data. For example, the CDC data says that 0% of deaths in California were Hispanic/Latino people, whereas the California public health website reports that Hispanics/Latinos made up 46% of deaths (17K people) as of Jan 27. If we added 15K Hispanic/Latino deaths from California, the total percentage of Hispanic/Latinos who died from COVID-19 would increase from 0.04% to 0.07%.

If all 54K deaths with missing race/ethnicity were Hispanic/Latino people, the percent of Hispanic/Latinos in the U.S. who died from COVID-19 would go from 0.04% to 0.13% — a 3x increase. If all 54K deaths with missing race/ethnicity were Black people, the percent of Black people who died from COVID-19 would go from 0.08% to 0.20% — a 2.5x increase. While both of these scenarios are unlikely, they show why missing race/ethnicity data is preventing us from fully understanding and addressing the disparities in the COVID-19 pandemic in the U.S.

Overview

The goal of this analysis is to assess the completeness of the CDC's Restricted Access data and its feasibility in examining disparities in race/ethnicity for COVID-19 deaths at the state and county levels. For a case data analysis with more background information, see this report.

The top-level data completeness findings are:

  1. Data Overview: The field indicating if the person died or not is only known for 48% of cases. Race/ethnicity was known for 78% of cases where the person died, as opposed to 98%-100% for all the other fields below.
In [6]:
#@title
field_list = ['cdc_case_earliest_dt', 'current_status', 'res_state', 'res_county', 'sex', 'age_group', 'race_ethnicity_combined']
project_id = 'msm-secure-data-1b'
table = '`msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`'
FieldAnalysis(project_id, table, field_list).display()
  1. Total Case Counts: The CDC data contains 80% of the total cases reported in more reliable data sources on the same data. There's high variability at the state level with 10 states reporting fewer than 10% of cases and several reporting 0%.
  2. Cases with Race/Ethnicity: Race/ethnicity data is available for 78% of cases in the CDC data compared to 93% in the Covid Racial Data Tracker. The Covid Racial Tracker has less variability with all but one state reporting at least 70% of deaths with race/ethnicity.

At the end, we also evaluate an alternative source for deaths data that comes from death certificates reported in the the CDC Provisional Deaths Counts data. The CDC Provisional Deaths data is more complete within each county it reports than the case data, but the Provisional Deaths data only contains data for counties with 100 or more deaths, which was 579 counties on Jan 27 vs. the 2,328 counties in the CDC case data on Dec 16. The provisional deaths data is a good alternative to the case data if you are more interested in counties with high populations that account for the most deaths rather than a comprehensive view of deaths across the U.S. that includes rural areas with smaller populations.

Recommendations on how to improve state and county data:

  • The data on deaths is even less complete than that for cases because many states and counties are not reporting whether people who had COVID-19 died or not. This is compounded on top of the separate issues with race/ethnicity data that affect the overall case data.

What we didn't include in this report:

Completeness Analysis

Total Death Counts

Baseline: NYT vs. CRDT

To get a baseline of how much we could expect the CDC death counts to match the CRDT or NYT, we can see how closely the CRDT and NYT match each other. Each dot below is a state (hover to see details), and the black line shows where the NYT and CRDT death counts are equal.

In [7]:
#@title
def CreateScatterPlot(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
    axis_format = ',.0f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'
    axis_format = '.0%'

  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))
  plot = alt.Chart(chart_df).mark_circle(size=60).encode(
      alt.X(fields_dict['x']['name'] + ':Q', axis=alt.Axis(title=fields_dict['x']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      alt.Y(fields_dict['y']['name'] + ':Q', axis=alt.Axis(title=fields_dict['y']['title'], format=axis_format),
          scale=alt.Scale(domain=(0, scale_max))
      ),
      color=alt.Color(fields_dict['percent']['name'],
                      type='quantitative',
                      scale=alt.Scale(scheme=scale_scheme,
                                      reverse=scale_reverse,
                                      domain=scale_domain,
                                      clamp=True),
                      legend=alt.Legend(format=legend_format),
                      title=metric_type.capitalize()),
      tooltip=tooltips,
  ).properties(
      height=height,
      width=width,
  )
  if metric_type == 'ratio':
    plot.interactive()

  line = pd.DataFrame({
      'x': [0, scale_max],
      'y': [0, scale_max],
  })

  if metric_type == 'ratio':
    line_plot = alt.Chart(line).mark_line(color='black').encode(
        x='x',
        y='y',
    )
  elif metric_type == 'percent':
    line_plot = (
        alt.Chart(pd.DataFrame({'x': [.5]})).mark_rule().encode(y='x') +
        alt.Chart(pd.DataFrame({'y': [.5]})).mark_rule().encode(x='y')
    )
  # Add interative for concatenating due to https://github.com/altair-viz/altair/issues/2010.
  scatter = (plot + line_plot).properties(
      title=title,
      height=height,
      width=width,
  ).interactive()
  return scatter

def CreateMap(
    chart_df, fields_dict, title, scale_max, height, width, geo, metric_type):
  
  geo_field = 'state'
  geo_field_display_name = 'State'
  fips_code = 'state_fips_code'
  topo_feature = us_states
  if geo == 'county':
    geo_field = 'state_county'
    geo_field_display_name = 'County'
    fips_code = 'county_fips'
    topo_feature = us_counties

  if metric_type == 'ratio':
    scale_scheme = 'blueorange'
    scale_reverse = True
    scale_domain = [0, 2]
    legend_format = '.1f'
  elif metric_type == 'percent':
    scale_scheme = 'redyellowblue'
    scale_reverse = False
    scale_domain = [0, 1]
    legend_format = '.0%'

  highlight = alt.selection_single(on='mouseover', fields=['id', fips_code], empty='none')
  tooltips = [alt.Tooltip(geo_field + ':N', title=geo_field_display_name)]
  for field in ('y', 'x', 'percent'):
    tooltips.append(alt.Tooltip(
        fields_dict[field]['name'] + ':Q',
        format=fields_dict[field]['format'],
        title=fields_dict[field]['title'],
    ))

  field_names = [geo_field]
  field_names.extend([fields_dict[field]['name'] for field in fields_dict])
  plot = alt.Chart(topo_feature).mark_geoshape(
        stroke='white',
        strokeOpacity=.2,
        strokeWidth=1
    ).project(
      type='albersUsa'
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(chart_df, fips_code, field_names)
    ).encode(
        alt.Color(fields_dict['percent']['name'],
                  type='quantitative',  
                  legend=alt.Legend(format=legend_format),
                  scale=alt.Scale(scheme=scale_scheme,
                                  reverse=scale_reverse,
                                  domain=scale_domain,
                                  clamp=True,
                                  ),
                  title=metric_type.capitalize()),
         tooltip=tooltips
    ).add_selection(
        highlight,
    )

  states_outline = alt.Chart(us_states).mark_geoshape(stroke='white', strokeWidth=1.5, fillOpacity=0, fill='white').project(
        type='albersUsa'
  )

  states_fill = alt.Chart(us_states).mark_geoshape(
        fill='silver',
        stroke='white'
  ).project('albersUsa')

  layered_map = alt.layer(states_fill, plot, states_outline).properties(
        height=height,
        width=width,
        title=title,
  )
  return layered_map

def CreateScatterPlotAndMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, map_width, geo, metric_type):
  scatter = CreateScatterPlot(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, scatter_width, geo, metric_type)
  map = CreateMap(
    chart_df, fields_dict, title, total_cases_scale_max, scatter_height, map_width, geo, metric_type)
  return (scatter | map).configure_view(
       strokeWidth=0,
   ).configure_mark(
       stroke='grey'
   ).configure_legend(
       gradientLength=scatter_height - 50
   )

def PrintSummaryStats(chart_df, field='percent'):
  below_15 = len(chart_df[chart_df[field] < .85]) / len(chart_df)
  above_15 = len(chart_df[chart_df[field] > 1.15]) / len(chart_df)
  print('between +/-15%: ', round(1 - below_15 - above_15, 2))
  below_50 = len(chart_df[chart_df[field] < .5]) / len(chart_df)
  above_50 = len(chart_df[chart_df[field] > 1.5]) / len(chart_df)
  print('between +/-50%: ', round(1 - below_50 - above_50, 2))
  print('< than .50: ', len(chart_df[chart_df[field] < .5]))
  print('> than 1.50: ', len(chart_df[chart_df[field] > 1.5]))
  print(chart_df[field].describe())
In [8]:
#@title
states_to_fips = {'AL': 1, 'AK': 2, 'AZ': 4, 'AR': 5, 'AS': 3, 'CA': 6, 'CO': 8, 'CT': 9, 'DC': 11, 'DE': 10, 'FL': 12, 'GA': 13, 'GU': 14, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19, 'KS': 20, 'KY': 21, 'LA': 22, 'ME': 23, 'MD': 24, 'MA': 25, 'MI': 26, 'MN': 27, 'MS': 28, 'MO': 29, 'MT': 30, 'NE': 31, 'NV': 32, 'NH': 33, 'NJ': 34, 'NM': 35, 'NY': 36, 'NYC': 36, 'NC': 37, 'ND': 38, 'OH': 39, 'OK': 40, 'OR': 41, 'PA': 42, 'PR': 43, 'RI': 44, 'SC': 45, 'SD': 46, 'TN': 47, 'TX': 48, 'UT': 49, 'VT': 50, 'VA': 51, 'VI': 52, 'WA': 53, 'WV': 54, 'WI': 55, 'WY': 56, 'AS': 60, 'GU': 66, 'MP': 69, 'PR': 72, 'VI': 78}

crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)
crdt_df.set_index('state', inplace=True)

nyt_states_df = pd.io.gbq.read_gbq(nyt_states_query, project_id=project_id)
nyt_states_df.state_fips_code.unique()
nyt_territories = ('Puerto Rico', 'Guam', 'Virgin Islands', 'Northern Mariana Islands', 'American Samoa')
for territory in nyt_territories:
  nyt_states_df = nyt_states_df[nyt_states_df.state_name != territory]
nyt_states_df['state_fips_code'] = nyt_states_df.state_fips_code.astype(int)
nyt_states_df.set_index('state_fips_code', inplace=True)

crdt_df.reset_index(inplace=True)
crdt_df['state_fips_code'] = crdt_df.state
crdt_df = crdt_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_df.set_index('state_fips_code', inplace=True)
nyt_crdt_merged_df = nyt_states_df.join(crdt_df, on="state_fips_code", how='inner', lsuffix='_left', rsuffix='_right')

nyt_crdt_merged_df['percent'] = round(nyt_crdt_merged_df.nyt_cases / nyt_crdt_merged_df.crdt_cases, 2)
nyt_crdt_merged_df
nyt_crdt_merged_df.reset_index(inplace=True)


below_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent < .85]) / len(nyt_crdt_merged_df)
above_15 = len(nyt_crdt_merged_df[nyt_crdt_merged_df.percent > 1.15]) / len(nyt_crdt_merged_df)
#print('between +/-15%: ', round(1 - below_15 - above_15, 2))
#nyt_crdt_merged_df.percent.describe()
In [9]:
#@title
nyt_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of NYT to CRDT'},
}
nyt_crdt_title = 'Ratio of NYT to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    nyt_crdt_merged_df, nyt_crdt_fields_dict, nyt_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

States: CDC vs. CRDT

We can see below that the CDC death counts differ from the CRDT death counts much more drastically than the NYT did.

In [10]:
#@title
cdc_states_df = pd.io.gbq.read_gbq(cdc_states_query, project_id=project_id)
cdc_states_df.rename(columns={'res_state': 'state'}, inplace=True)
cdc_states_df.set_index('state', inplace=True)

crdt_df = pd.io.gbq.read_gbq(crdt_query, project_id=project_id)

for territory in territories:
  crdt_df = crdt_df[crdt_df.state != territory]

crdt_df.set_index('state', inplace=True)
cdc_crdt_merged_df = cdc_states_df.join(crdt_df, on="state", how='right', lsuffix='_left', rsuffix='_right')
cdc_crdt_merged_df.reset_index(inplace=True)
cdc_crdt_merged_df['state_fips_code'] = cdc_crdt_merged_df.state
cdc_crdt_merged_df = cdc_crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
cdc_crdt_merged_df['percent'] = round(cdc_crdt_merged_df.cdc_cases / cdc_crdt_merged_df.crdt_cases, 4)

# PrintSummaryStats(cdc_crdt_merged_df)
In [11]:
#@title
cdc_crdt_fields_dict = {
    'x': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
cdc_crdt_title = 'Ratio of CDC to CRDT Deaths by State as of %s' % date_display_name

CreateScatterPlotAndMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, scatter_height, scatter_width, map_width, 'state', 'ratio'
).display()

Counties: CDC vs. NYT

In [12]:
#@title
# CDC vs. NYT county

df = pd.io.gbq.read_gbq(cdc_counties_query, project_id=project_id)
for territory in territories:
  df = df[df.res_state != territory]

project_id = 'msm-secure-data-1b'
df_county_fips_map = pd.io.gbq.read_gbq(f'''
SELECT
*
FROM
  `msm-secure-data-1b.ndunlap_secure.county_fips_mapping`
''', project_id=project_id)

df_county_fips_map.cdc_county = df_county_fips_map.cdc_county.str.lower()
df_county_fips_map['state_county'] = df_county_fips_map.state + '-' + df_county_fips_map.cdc_county
df_county_fips_map['state_county'] = df_county_fips_map.state_county.astype('string').str.strip()
df_county_fips_map.set_index('state_county', inplace=True)
In [13]:
#@title
# Concatenate the state and county names because county names are not unique across states.
df.res_county = df.res_county.str.lower()
df['state_county'] = df.res_state + '-' + df.res_county
df['state_county'] = df.state_county.astype('string').str.strip()
df.set_index('state_county', inplace=True)
df['race_ethnicity_combined'] = df.race_ethnicity_combined.astype('string').str.strip()

df = df.replace(to_replace={'race_ethnicity_combined': race_ethnicity_combined_map})
In [14]:
#@title
mismatches_df = df.join(df_county_fips_map, on="state_county", how='outer', lsuffix='_left', rsuffix='_right')
mismatches_df = mismatches_df[mismatches_df.county_fips.isna()]
mismatches_df = mismatches_df[mismatches_df.res_state != 'NA']
mismatches_df = mismatches_df[mismatches_df.res_state != 'Unknown']
mismatches_df = mismatches_df[mismatches_df.res_county != 'na']
mismatches_df = mismatches_df[mismatches_df.res_county != 'unknown']
#print(mismatches_df.cases.sum())
#print('vs. 60363 with NULL county_fips_code')
# SELECT 
#count(*) as total_cases,
#FROM `msm-secure-data-1b.ndunlap_secure.cdc_restricted_access_20201231`
#WHERE county_fips_code IS NULL
In [15]:
#@title
merged_df = df.join(df_county_fips_map, on="state_county", how='inner', lsuffix='_left', rsuffix='_right')

# Create a crosstab table with rows = counties, columns = race_ethnicity_combined.
crosstab_df = pd.crosstab(merged_df['county_fips'], merged_df.race_ethnicity_combined, values=merged_df.cases, aggfunc=sum,
                          margins=True,
                          margins_name='total_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['county_fips'] = crosstab_df.county_fips.astype(int)
crosstab_df['total_known_cases'] = crosstab_df['total_cases'] - crosstab_df.na_cases.fillna(0) - crosstab_df.unknown_cases.fillna(0)
In [16]:
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
df_acs_name_lookup = pd.io.gbq.read_gbq(f'''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''', project_id=project_id)

df_acs_name_lookup['state_county'] = df_acs_name_lookup.county.astype('string').str.strip() + ', ' + df_acs_name_lookup.state.astype('string').str.strip()
df_acs_name_lookup.drop(columns=['state', 'county'], inplace=True)
df_acs_name_lookup.set_index('county_fips', inplace=True)

county_chart_df = crosstab_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_df.county_fips = county_chart_df.county_fips.astype(int)

#print(len(county_chart_df))
#print(county_chart_df.total_pop.sum())
#print(county_chart_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print(0.55 * 324697795) # NYT population
In [17]:
#@title

nyt_counties_df = pd.io.gbq.read_gbq(nyt_counties_query, project_id=project_id)
nyt_counties_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_df.county_fips.unique()
nyt_counties_df['county_fips'] = nyt_counties_df.county_fips.astype(int)
nyt_counties_df.set_index('county_fips', inplace=True)

county_chart_df.set_index('county_fips', inplace=True)
nyt_merged_df = county_chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_merged_df = nyt_merged_df.reset_index()
nyt_merged_df['percent'] = round(nyt_merged_df.total_cases / nyt_merged_df.nyt_cases, 2)

#PrintSummaryStats(nyt_merged_df)

We can do the same analysis at the county level using the CDC vs. NYT data.

Each dot is a county (hover to see details). We show all 2,323 counties in the CDC data that were also in the NYT data on the left and zoom in on the smaller counties on the right. Note that the five counties in New York City are missing because the NYT combined them into one region.

In [18]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC Deaths to NYT Deaths by County as of Dec 16'
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'

scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, 10000, scatter_height, scatter_width, 'county', 'ratio'
)
zoom_scatter = CreateScatterPlot(
    nyt_merged_df, cdc_nyt_fields_dict, zoom_cdc_nyt_title, 3000, scatter_height, scatter_width, 'county', 'ratio'
)

(scatter | zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [19]:
#@title
cdc_nyt_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_title = 'Ratio of CDC Deaths to NYT Deaths by County as of Dec 16'

cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height, map_width, 'county', 'ratio'
)
cdc_crdt_map = CreateMap(
    cdc_crdt_merged_df, cdc_crdt_fields_dict, cdc_crdt_title, total_cases_scale_max, map_height, map_width, 'state', 'ratio'
)

(cdc_crdt_map | cdc_nyt_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

Notes:

  • The legend only goes to 2.0, and all counties with a larger ratio are shown in the same dark blue color.
  • States can have cases but no counties with cases because county names can be suppressed for privacy reasons.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

Deaths with Race/Ethnicity

States and Counties: CDC

In [20]:
#@title
states_df = pd.io.gbq.read_gbq(compare_cases_unknowns_query, project_id=project_id)
for state in ('Unknown', 'NA', 'OCONUS'):
  states_df = states_df[states_df.res_state != state]

states_df['race_ethnicity_combined'] = states_df.race_ethnicity_combined.astype('string').str.strip()
states_df = states_df.replace(to_replace={'race_ethnicity_combined': {
    'Asian, Non-Hispanic': 'cdc_known_cases',
    'Black, Non-Hispanic': 'cdc_known_cases',
    'White, Non-Hispanic': 'cdc_known_cases',
    'American Indian/Alaska Native, Non-Hispanic': 'cdc_known_cases',
    'Hispanic/Latino': 'cdc_known_cases',
    'Multiple/Other, Non-Hispanic': 'cdc_known_cases',
    'Native Hawaiian/Other Pacific Islander, Non-Hispanic': 'cdc_known_cases',
    'Missing': 'cdc_unknown_cases',
    'Unknown': 'cdc_unknown_cases',
    'NA': 'cdc_na_cases',
    }})
states_df.rename(columns={'res_state': 'state'}, inplace=True)
In [21]:
#@title
crosstab_df = pd.crosstab(states_df['state'], states_df.race_ethnicity_combined, values=states_df.cdc_cases, aggfunc=sum,
                          margins=True,
                          margins_name='cdc_cases'
)
# Have to reset_index() to go from pandas multi-index to single index.
crosstab_df = crosstab_df.reset_index()
crosstab_df.drop(axis=0, index=len(crosstab_df) - 1, inplace=True)
crosstab_df['cdc_known_or_na_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df['cdc_known_cases'] = crosstab_df['cdc_cases'] - crosstab_df.cdc_na_cases.fillna(0) - crosstab_df.cdc_unknown_cases.fillna(0)
crosstab_df

crdt_merged_df = crosstab_df.join(crdt_df, on="state", how='inner', lsuffix='_left', rsuffix='_right')
crdt_merged_df.reset_index(inplace=True)
crdt_merged_df['state_fips_code'] = crdt_merged_df.state
crdt_merged_df = crdt_merged_df.replace(to_replace={'state_fips_code': states_to_fips})
crdt_merged_df['cdc_known_cases_percent'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['cdc_known_or_na_cases_percent'] = round(crdt_merged_df.cdc_known_or_na_cases / crdt_merged_df.cdc_cases, 4)
crdt_merged_df['percent'] = round(crdt_merged_df.cdc_cases / crdt_merged_df.crdt_cases, 4)
crdt_merged_df['percent_known_cases'] = round(crdt_merged_df.cdc_known_cases / crdt_merged_df.crdt_known_race_cases, 4)

crdt_merged_df_no_ny = crdt_merged_df[crdt_merged_df.state != 'NY']
#PrintSummaryStats(crdt_merged_df_no_ny)

When evaluating the percent of deaths that report on race/ethnicity in the CDC data, we also need to consider the 3% of overall deaths with race/ethnicity that were suppressed due to privacy reasons. We should give states and counties credit for reporting race/ethnicity data for those deaths even if we aren't able to use it due to privacy suppression. Below, the maps on the top left shows the percent of deaths with known race/ethnicity and the map on the top right shows the percent of deaths with known or suppressed race/ethnicity. The maps on the bottom show the same information at the county level.

In [22]:
#@title

chart_df = county_chart_df.copy(deep=True)
chart_df.reset_index(inplace=True)
chart_df.county_fips = chart_df.county_fips.astype(int)
chart_df['percent_known_cases'] = round(chart_df.total_known_cases / chart_df.total_cases, 2)
chart_df['total_known_or_na_cases'] = chart_df.total_known_cases.fillna(0) + chart_df.na_cases.fillna(0)
chart_df['percent_known_or_na_cases'] = round(chart_df.total_known_or_na_cases / chart_df.total_cases, 2)
In [23]:
#@title
cdc_known_state_fields_dict = {
    'x': {'name': 'cdc_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

cdc_known_state_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_state_map = CreateMap(
    crdt_merged_df, cdc_known_state_fields_dict, cdc_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

cdc_known_or_na_state_fields_dict = {
    'x': {'name': 'cdc_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'cdc_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_state_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_state_map = CreateMap(
    crdt_merged_df, cdc_known_or_na_state_fields_dict, cdc_known_or_na_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(cdc_known_state_map | cdc_known_or_na_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [24]:
#@title
cdc_known_county_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_title = 'CDC Deaths with Known Race/Ethnicity as of %s' % date_display_name
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

cdc_known_or_na_county_fields_dict = {
    'x': {'name': 'total_known_or_na_cases', 'format': ',', 'title': 'Known or suppressed race/ethnicity deaths'},
    'y': {'name': 'total_cases', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'Percent known or suppressed deaths'},
}
cdc_known_or_na_county_title = 'CDC Deaths with Known+Suppressed Race/Ethnicity as of %s' % date_display_name
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_map | cdc_known_or_na_county_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [25]:
#@title
#PrintSummaryStats(crdt_merged_df, field='cdc_known_cases_percent')
#PrintSummaryStats(crdt_merged_df, field='cdc_known_or_na_cases_percent')
#tuple(crdt_merged_df[crdt_merged_df.cdc_known_or_na_cases_percent <= .5].state)

Note: A larger version of the county maps for hovering over smaller counties is available in the Appendix.

States: CDC vs. CRDT

How does the CDC data compare to the CRDT data, which is the most up-to-date aggregate data we have for race/ethnicity at the state level? Overall, 93% of the deaths in the CRDT data have known race/ethnicity compared to 78% in the CDC data (81% with suppressed data).

We may even be undercounting the 93% of deaths with known race/ethnicity in the CRDT data because of the non-standard ways that each state reports on race/ethnicity, as described in this Covid Racial Data Tracker analysis. If a state uses a combined race/ethnicity field, then it's a straightforward comparison to the CDC's combined race/ethnicity field. If a state uses separate fields for race/ethnicity, then we still use the number of people with known race within each state because all of the race categories will also contain Hispanic/Latino people. We could potentially be undercounting the number of people with known race/ethnicity in the CRDT if there are people who have unknown race but known ethnicity. If we adjusted the numbers in those cases, it would make the CRDT percentages look even better in comparison to the CDC data.

In [26]:
#@title
crdt_known_state_fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'crdt_cases', 'format': ',', 'title': 'CRDT deaths'},
    'percent': {'name': 'crdt_known_race_cases_percent', 'format': '.0%', 'title': 'Percent known deaths'},
}

crdt_known_state_title = 'CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name
crdt_known_map = CreateMap(
    cdc_crdt_merged_df, crdt_known_state_fields_dict, crdt_known_state_title, total_cases_scale_max, map_height, map_width, 'state', 'percent'
)

(crdt_known_map | cdc_known_state_map).configure(
    padding={"left": 0, "top": 5, "right": 0, "bottom": 5}
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()
In [27]:
#@title
fields_dict = {
    'x': {'name': 'crdt_known_race_cases', 'format': ',', 'title': 'CRDT known race/ethnicity deaths'},
    'y': {'name': 'cdc_known_cases', 'format': ',', 'title': 'CDC known race/ethnicity deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.2f', 'title': 'Ratio of CDC to CRDT'},
}
title = 'Ratio of CDC to CRDT Deaths with Known Race/Ethnicity as of %s' % date_display_name

CreateScatterPlotAndMap(
    crdt_merged_df, fields_dict, title, 40000, scatter_height, scatter_width, map_width - 5, 'state', 'ratio'
).display()
In [28]:
#@title
#print(crdt_merged_df.crdt_known_race_cases.sum() / crdt_merged_df.crdt_cases.sum())
#PrintSummaryStats(cdc_crdt_merged_df, field='crdt_known_race_cases_percent')

How to Improve State and County Data

How can states and counties improve their completeness for race/ethnicity data, especially when compared to the more reliable and up-to-date aggregate data that comes from public health websites, as collected by the CRDT and NYT?

There are two ways in which states can improve the data they send to the CDC:

  1. Increase the total deaths reported to get closer to the aggregate data.
  2. Increase the percentage of deaths reported with known race/ethnicity to get closer to 100%.

In the Total Death Counts section above, we identified the states and counties with the biggest discrepancies relative to aggregate data. In the Deaths with Race/Ethnicity section, we looked at the percentage of deaths within each state and county that have race/ethnicity data.

The charts below show those two components together; the scatterplots show (1) the discrepancy vs. CRDT/NYT total death counts on the y-axis, and (2) the percentage of deaths with known or suppressed race/ethnicity on the x-axis. The colors of the dots and on the map show the product of those two numbers, which is the percentage of total deaths in the CRDT/NYT that the CDC data has with known or suppressed race/ethnicity.

The scatterplots below can help us diagnose the issues in each state or county:

  • Bottom left quadrant: Low percentage of deaths reported, low reporting of race/ethnicity.
  • Top left quadrant: Mid-to-high percentage of deaths reported, low reporting of race/ethnicity.
  • Bottom right quadrant: Low percentage of deaths reported, mid-to-high reporting of race/ethnicity.
  • Top right quadrant: Mid-to-high percentage of deaths reported, mid-to-high reporting of race/ethnicity.
In [29]:
#@title
nyt_cdc_known_merged_df = chart_df.join(nyt_counties_df, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
nyt_cdc_known_merged_df.reset_index(inplace=True)
nyt_cdc_known_merged_df['percent'] = round(nyt_cdc_known_merged_df.total_cases / nyt_cdc_known_merged_df.nyt_cases, 2)
In [30]:
#@title
crdt_merged_df['percent_max_100'] = crdt_merged_df.percent.clip(upper=1)
crdt_merged_df['percent_reccs'] = crdt_merged_df.percent_max_100 * crdt_merged_df.cdc_known_or_na_cases_percent
state_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of CRDT total deaths'},
    'x': {'name': 'cdc_known_or_na_cases_percent', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of CRDT total with race/ethnicity'},
}
state_reccs_title = 'CDC Deaths: Total Deaths x Race/Ethnicity as of %s' % date_display_name

scatter = CreateScatterPlotAndMap(
    crdt_merged_df, state_reccs_fields_dict, state_reccs_title, 1, scatter_height, scatter_width, map_width, 'state', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [31]:
#@title
nyt_cdc_known_merged_df['percent_max_100'] = nyt_cdc_known_merged_df.percent.clip(upper=1)
nyt_cdc_known_merged_df['percent_reccs'] = nyt_cdc_known_merged_df.percent_max_100 * nyt_cdc_known_merged_df.percent_known_or_na_cases
county_reccs_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_or_na_cases', 'format': '.0%', 'title': 'CDC percent with known or suppressed race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_title = state_reccs_title = 'CDC Deaths: Total Deaths x Race/Ethnicity as of %s' % date_display_name


scatter = CreateScatterPlotAndMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%.
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.
In [32]:
#@title
cdc_states_by_month_df = pd.io.gbq.read_gbq(cdc_states_by_month_query, project_id=project_id)
cdc_states_by_month_df.set_index(keys=['res_state', 'date'], inplace=True)

cdc_states_by_month_known_or_na_df = pd.io.gbq.read_gbq(cdc_states_by_month_known_or_na_query, project_id=project_id)
cdc_states_by_month_known_or_na_df.set_index(keys=['res_state', 'date'], inplace=True)

cdc_known_over_time = cdc_states_by_month_df.join(cdc_states_by_month_known_or_na_df, how='left')
cdc_known_over_time['percent_known_or_na'] = round(cdc_known_over_time.known_or_na_cases / cdc_known_over_time.total_cases, 2)
cdc_known_over_time.reset_index(inplace=True)
In [33]:
#@title
base = alt.Chart(cdc_known_over_time).mark_line(point=True).encode(
    x=alt.X('date', title='CDC earliest report date', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('percent_known_or_na', title='Percent unknown or suppressed race/ethnicity', axis=alt.Axis(format='%')),
    color=alt.Color('res_state', scale=alt.Scale(scheme='category20'), title='State')
).properties(
    title='States with less than 50% of Cumulative Cases with Known or Suppressed Race/Ethnicity',
    height=map_height,
    width=map_width
)
#base.display()

What about Data from Death Certificates?

The CDC has an alternative public source for death data with race/ethnicity at the county level, which comes from a different source: death certificates. This data is more complete on all counts except for the number of counties it contains, which is about 1/4 of the number of counties in the CDC case data. The reason for this difference is that the CDC Provisional Deaths data only includes counties with 100 or more deaths. The data is also presented as a cumulative total updated each week as opposed to having a case report date as in the CDC case data.

In [34]:
#@title
# Manually update these fields based on the latest CDC data.
row_names = [
    'Update frequency',
    'Data as of date',
    'Deaths in data as of date',
    'Deaths in CTP as of date',
    '(as a % of CTP)',
    'Number of counties',
    '(as a % of all counties)',
    'U.S. population in those counties',
    '(as a % of total U.S population – States + D.C.)',
    'Deaths with known race/ethnicity and county',
    '(as a % of deaths in data)',
    'Access',
    'Limitations'
]
cdc_deaths_metadata = [
    'Weekly',
    'Jan 27, 2021',
    '309K',
    '420K',
    '(74%)',
    '579',
    '(18%)',
    '246K',
    '(76%)',
    '304K',
    '(98%)',
    'Public',
    'Cumulative, counties >= 100 deaths'
]
cdc_cases_metadata = [
    'Monthly', 
    'Dec 16, 2020',
    '239K',
    '299K',
    '(80%)',
    '2,328',
    '(74%)',
    '278M',
    '(86%)',
    '185K',
    '(77%)',
    'Restricted',
    'Data completeness issues'
]
table_data = {'CDC Case Data': cdc_cases_metadata, 'CDC Provisional Deaths': cdc_deaths_metadata}
metadata_df = pd.DataFrame(table_data, index=row_names)
metadata_df.head(15)
Out[34]:
CDC Case Data CDC Provisional Deaths
Update frequency Monthly Weekly
Data as of date Dec 16, 2020 Jan 27, 2021
Deaths in data as of date 239K 309K
Deaths in CTP as of date 299K 420K
(as a % of CTP) (80%) (74%)
Number of counties 2,328 579
(as a % of all counties) (74%) (18%)
U.S. population in those counties 278M 246K
(as a % of total U.S population – States + D.C.) (86%) (76%)
Deaths with known race/ethnicity and county 185K 304K
(as a % of deaths in data) (77%) (98%)
Access Restricted Public
Limitations Data completeness issues Cumulative, counties >= 100 deaths
In [35]:
#@title
# Get the display names for each county.
# Use ACS data that only has one FIPS code per county unlike the fips_county_map.
cdc_provisional_deaths_df = pd.io.gbq.read_gbq(f'''
SELECT
  *
FROM
  `msm-secure-data-1b.ndunlap_secure.cdc_provisional_deaths_20210127`
''', project_id=project_id)

df_acs_name_lookup = pd.io.gbq.read_gbq(f'''
SELECT
  state,
  county,
  county_fips,
  total_pop
FROM
  `msm-internal-data.ipums_acs.acs_2019_5year_county`
''', project_id=project_id)

nyt_counties_query_jan_27 = ('''
SELECT
  county_fips_code,
  deaths as nyt_cases,
FROM `bigquery-public-data.covid19_nyt.us_counties`
WHERE
  date = DATE(2021, 01, 27)  AND
  county_fips_code IS NOT NULL
''')

df_acs_name_lookup.set_index('county_fips', inplace=True)

cdc_provisional_deaths_df['county_fips'] = cdc_provisional_deaths_df.FIPS_Code
cdc_provisional_deaths_df.set_index('county_fips', inplace=True)
cdc_provisional_deaths_df['state_county'] = cdc_provisional_deaths_df.County_Name + ', ' + cdc_provisional_deaths_df.State
cdc_provisional_deaths_df['total_known_cases'] = round((
    cdc_provisional_deaths_df.Non_Hispanic_White.fillna(0) +
    cdc_provisional_deaths_df.Non_Hispanic_Black.fillna(0) +
    cdc_provisional_deaths_df.Non_Hispanic_American_Indian_or_Alaska_Native.fillna(0) +
    cdc_provisional_deaths_df.Non_Hispanic_Asian.fillna(0) +
    cdc_provisional_deaths_df.Non_Hispanic_Native_Hawaiian_or_Other_Pacific_Islander.fillna(0) +
    cdc_provisional_deaths_df.Hispanic.fillna(0)) * cdc_provisional_deaths_df.COVID_19_Deaths, 0)

county_chart_provisional_df = cdc_provisional_deaths_df.join(df_acs_name_lookup, on="county_fips", how='inner', lsuffix='_left', rsuffix='_right')
county_chart_provisional_df.reset_index(inplace=True)
county_chart_provisional_df.county_fips = county_chart_provisional_df.county_fips.astype(int)

#print(len(county_chart_provisional_df))
#print(county_chart_provisional_df.total_pop.sum())
#print(county_chart_provisional_df.total_pop.sum() / 324697795)  # Population covered in these counties
#print(0.55 * 324697795) # NYT population

nyt_counties_provisional_df = pd.io.gbq.read_gbq(nyt_counties_query_jan_27, project_id=project_id)
nyt_counties_provisional_df.rename(columns={'county_fips_code': 'county_fips'}, inplace=True)
nyt_counties_provisional_df.county_fips.unique()
nyt_counties_provisional_df['county_fips'] = nyt_counties_provisional_df.county_fips.astype(int)
nyt_counties_provisional_df.set_index('county_fips', inplace=True)

county_chart_provisional_df.set_index('county_fips', inplace=True)
nyt_merged_provisional_df = county_chart_provisional_df.join(nyt_counties_provisional_df, on="county_fips", how='left', lsuffix='_left', rsuffix='_right')
nyt_merged_provisional_df = nyt_merged_provisional_df.reset_index()
nyt_merged_provisional_df['percent'] = round(nyt_merged_provisional_df.COVID_19_Deaths / nyt_merged_provisional_df.nyt_cases, 4)
nyt_merged_provisional_df['percent_known_cases'] = round(nyt_merged_provisional_df.total_known_cases / nyt_merged_provisional_df.COVID_19_Deaths, 4)
In [36]:
#@title
cdc_nyt_provisional_fields_dict = {
    'x': {'name': 'nyt_cases', 'format': ',', 'title': 'NYT deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent', 'format': '.2f', 'title': 'Ratio of CDC to NYT'},
}
cdc_nyt_provisional_title = 'Ratio of CDC Provisional Deaths to NYT Deaths by County as of Jan 27'
zoom_cdc_nyt_title = 'Zoom in on counties with up to 3,000 Deaths'

scatter = CreateScatterPlot(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, 16000, scatter_height, scatter_width, 'county', 'ratio'
)
zoom_scatter = CreateScatterPlot(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, zoom_cdc_nyt_title, 3000, scatter_height, scatter_width, 'county', 'ratio'
)

(scatter | zoom_scatter).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [37]:
#@title
cdc_nyt_provisional_map = CreateMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, total_cases_scale_max, map_height, map_width, 'county', 'ratio'
)

(cdc_nyt_provisional_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

In the map below, we can see that the CDC Provisional deaths data has a strikingly high percentage of deaths with known race/ethnicity. The data includes a total number of deaths from COVID-19 and the percentage of those deaths within different race/ethnicity categories. There is an "Other" category that includes More than one race or Unknown, and we treated that entire category as Unknown race/ethnicity. If a race/ethnicity group had fewer than 10 deaths, they are suppressed for privacy reasons, and we treated those as Unknowns as well.

In [38]:
#@title
cdc_known_county_provisional_fields_dict = {
    'x': {'name': 'total_known_cases', 'format': ',', 'title': 'Known race/ethnicity deaths'},
    'y': {'name': 'COVID_19_Deaths', 'format': ',', 'title': 'CDC deaths'},
    'percent': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'Percent known deaths'},
}
cdc_known_county_provisional_title = 'CDC Provisional Deaths with Known Race/Ethnicity as of %s' % 'Jan 27'
cdc_known_county_provisional_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height, map_width, 'county', 'percent'
)

(cdc_known_county_provisional_map).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).display()

We can look at the composite measure of how well each county reports on total deaths from CRDT/NYT and how many of those deaths have race/ethnicity data. In the CDC Provisional Deaths data, almost all counties are in the top right quadrant, which means that they have a mid-high percentage of total deaths and a mid-high percentage of deaths with race/ethnicity.

In [39]:
#@title
nyt_merged_provisional_df['percent_max_100'] = nyt_merged_provisional_df.percent.clip(upper=1)
nyt_merged_provisional_df['percent_reccs'] = nyt_merged_provisional_df.percent_max_100 * nyt_merged_provisional_df.percent_known_cases
county_reccs_provisional_fields_dict = {
    'y': {'name': 'percent_max_100', 'format': '.0%', 'title': 'CDC percent of NYT total deaths'},
    'x': {'name': 'percent_known_cases', 'format': '.0%', 'title': 'CDC percent with race/ethnicity'},
    'percent': {'name': 'percent_reccs', 'format': '.0%', 'title': 'Product: CDC percent of NYT total with race/ethnicity'},
}
county_reccs_provisional_title = state_reccs_title = 'CDC Provisional Deaths: Total Deaths x Race/Ethnicity as of %s' % 'Jan 27'

scatter_provisional = CreateScatterPlotAndMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, scatter_height, scatter_width, map_width, 'county', 'percent'
)
scatter_provisional.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Notes:

  • All states or counties with > 100% of the total deaths in the CRDT or NYT data were capped at 100%. Notes:
  • A larger version of the county map for hovering over smaller counties is available in the Appendix.

If you are looking for more reliable county-level deaths data with race/ethnicity, the CDC Provisional Deaths data is more complete than the CDC Case data in many ways, is publicly available, and is updated once a week rather than once a month. However, there are a few tradeoffs:

  • Only 18% of the counties in the U.S. are included in the data, but those counties account for 76% of the U.S. population.
  • You will not get a comprehensive view of COVID-19 deaths across the entire U.S. due to the exclusion of counties with fewer than 100 COVID-19 deaths, which translates to rural areas and counties with small populations. If you aggregate at the U.S. or state level, you will need to be careful to not generalize your findings to the entire U.S. based on data that comes from the most populous counties.
  • The data is cumulative so you will not be able to analyze deaths over time.

Appendix

Large county maps

To make it easier to hover over small counties, here are larger versions of the county maps that appeared in this report.

In [40]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_df, cdc_nyt_fields_dict, cdc_nyt_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_nyt_map.display()
In [41]:
#@title
cdc_known_county_map = CreateMap(
    chart_df, cdc_known_county_fields_dict, cdc_known_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)

cdc_known_county_map.display()
In [42]:
#@title
cdc_known_or_na_county_map = CreateMap(
    chart_df, cdc_known_or_na_county_fields_dict, cdc_known_or_na_county_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
).configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
)
cdc_known_or_na_county_map.display()
In [43]:
#@title
county_completeness = CreateMap(
    nyt_cdc_known_merged_df, county_reccs_fields_dict, county_reccs_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)
county_completeness.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [44]:
#@title
cdc_nyt_map = CreateMap(
    nyt_merged_provisional_df, cdc_nyt_provisional_fields_dict, cdc_nyt_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'ratio'
)

cdc_nyt_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [45]:
#@title
cdc_known_county_map = CreateMap(
    nyt_merged_provisional_df, cdc_known_county_provisional_fields_dict, cdc_known_county_provisional_title, total_cases_scale_max, map_height * 2, map_width * 2, 'county', 'percent'
)

cdc_known_county_map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()
In [46]:
#@title
map = CreateMap(
    nyt_merged_provisional_df, county_reccs_provisional_fields_dict, county_reccs_provisional_title, 1, map_height * 2, map_width * 2, 'county', 'percent'
)

map.configure_view(
    strokeWidth=0,
).configure_legend(
    gradientLength=map_height - 50
).configure_mark(
    stroke='grey'
).display()

Data Citations and Disclaimers

  • CDC data full citation: Centers for Disease Control and Prevention, COVID-19 Response. COVID-19 Case Surveillance Data Access, Summary, and Limitations (version date: December 31, 2020).
  • Per the CDC data agreement: The CDC does not take responsibility for the scientific validity or accuracy of methodology, results, statistical analyses, or conclusions presented.
  • Population data: U.S. Census Bureau's American Community Survey 2019 5-year estimates accessed via API; e.g., sample query.
  • Covid Racial Data Tracker data: Available in a public spreadsheet.
  • New York Times data: Available as a public CSV file or via Google Cloud Public Datasets.

Contact information

Please email us at shli-covid-data-analysis@googlegroups.com with questions or comments.

In [47]:
#%%shell
#jupyter nbconvert --to html 'cdc_death_data.ipynb'